library(knitr)
library(tidyverse)
library(caret)
library(DMwR)
library(rpart)
library(ROCR)
library(randomForest)
library(xgboost)
library(rpart.plot)

#----------------------------------------------------------------
#' #1. Collect and Prepare the Data
#----------------------------------------------------------------

# Load the raw case data and work on a copy so the original import
# remains available for reference.
data_orig <- read_csv("case3data.csv")

data <- data_orig

# Convert the categorical columns to factors in one pass and round
# GPA to three decimal places.
data <- data %>%
  mutate(
    across(
      c(grade, dropped, ethnicity, sex, zip,
        subsidizedLunches, sanctions, athleticSeasons, year),
      as.factor
    ),
    gpa = round(gpa, 3)
  )


# Histogram of every numeric column, one facet per variable, to eyeball
# distributions. pivot_longer() replaces the superseded gather().
data %>%
  keep(is.numeric) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot() +
  geom_histogram(mapping = aes(x = value, fill = key), color = "black") +
  facet_wrap(~ key, scales = "free") +
  theme_minimal() +
  theme(legend.position = 'none')

# Bar chart of level counts for every factor column, one facet per variable.
# Changes from the original: pivot_longer() replaces the superseded
# gather() (factors are coerced to character explicitly, which gather()
# did silently); count() replaces group_by()+summarise(); geom_col()
# replaces geom_bar(stat = 'identity').
data %>%
  keep(is.factor) %>%
  mutate(across(everything(), as.character)) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  count(key, value, name = "n") %>%
  ggplot() +
  geom_col(mapping = aes(x = value, y = n, fill = key), color = "black") +
  coord_flip() +
  facet_wrap(~ key, scales = "free") +
  theme_minimal() +
  theme(legend.position = 'none')

# Partition the data by school year: pre-2017 cohorts for training,
# the 2017 cohort for testing. (The original comment mentioned
# createDataPartition(), but the split is actually this year filter.)
sample <- data %>% 
  filter(year!=2017)

# Rebalance the minority class in the training set with SMOTE.
# NOTE: the dead assignment `data.train <- sample`, which was
# immediately overwritten, has been removed.
set.seed(1234)
data.train <- SMOTE(dropped ~ ., data.frame(sample),
                    perc.over = 200, perc.under = 250)

data_test <- data %>% 
  filter(year ==2017)

# Drop the identifier and partitioning columns - they are not predictors.
data.train <- data.train %>% 
  select(-studentID,-year)

# -- Logistic regression -------------------------------------------------
# Fit a full logistic model (all predictors) on the SMOTE-balanced
# training data.

logit_mod <-
  glm(dropped ~ ., family = binomial(link = 'logit'), data = data.train)

#' View the results of the model.
summary(logit_mod)
## 
## Call:
## glm(formula = dropped ~ ., family = binomial(link = "logit"), 
##     data = data.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5480  -0.5183  -0.1543   0.5529   3.0792  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              7.086e+00  2.149e-01  32.969  < 2e-16 ***
## grade11thGrade           9.441e-01  1.005e-01   9.396  < 2e-16 ***
## grade12thGrade           6.342e-01  1.082e-01   5.861 4.60e-09 ***
## grade9thGrade           -9.329e-01  1.052e-01  -8.871  < 2e-16 ***
## zip15206                -1.478e-01  9.323e-02  -1.585 0.112855    
## zip15208                -2.357e-01  1.239e-01  -1.902 0.057178 .  
## zip15224                 2.110e-02  1.171e-01   0.180 0.857033    
## zip15232                 1.451e-01  1.406e-01   1.032 0.301992    
## ethnicityAsian           3.512e-01  2.558e-01   1.373 0.169834    
## ethnicityHispanic        6.025e-01  2.463e-01   2.446 0.014443 *  
## ethnicityOther           5.376e-01  1.500e-01   3.585 0.000338 ***
## ethnicityWhite           9.560e-01  1.169e-01   8.178 2.90e-16 ***
## sexM                     5.974e-02  6.730e-02   0.888 0.374666    
## gpa                     -2.519e+00  5.927e-02 -42.500  < 2e-16 ***
## subsidizedLunchesNone   -1.848e-01  7.966e-02  -2.320 0.020365 *  
## subsidizedLunchesPartly  6.946e-02  1.066e-01   0.652 0.514618    
## employmentHours         -6.308e-04  7.111e-03  -0.089 0.929320    
## hrsWifiPerWeek           6.734e-02  9.400e-03   7.164 7.86e-13 ***
## sanctionsnothing        -4.506e-01  7.912e-02  -5.695 1.23e-08 ***
## sanctionssuspended       1.021e-01  1.409e-01   0.725 0.468593    
## librarySwipesPerWeek    -3.323e-01  1.714e-02 -19.380  < 2e-16 ***
## apClasses               -3.658e-01  7.266e-02  -5.034 4.79e-07 ***
## athleticSeasons1        -7.491e-01  8.477e-02  -8.837  < 2e-16 ***
## athleticSeasons2        -3.656e-01  1.082e-01  -3.380 0.000724 ***
## athleticSeasons3         1.697e-01  2.091e-01   0.812 0.417007    
## athleticSeasons4         2.031e-01  6.017e-01   0.338 0.735668    
## athleticSeasons5        -8.536e+00  2.296e+02  -0.037 0.970347    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10246.3  on 7743  degrees of freedom
## Residual deviance:  5660.3  on 7717  degrees of freedom
## AIC: 5714.3
## 
## Number of Fisher Scoring iterations: 11
### Backward stepwise selection (AIC-based) to prune uninformative
### predictors from the full logistic model.
logit_mod <- step(logit_mod,direction="backward",trace=FALSE)
summary(logit_mod)
## 
## Call:
## glm(formula = dropped ~ grade + zip + ethnicity + gpa + subsidizedLunches + 
##     hrsWifiPerWeek + sanctions + librarySwipesPerWeek + apClasses + 
##     athleticSeasons, family = binomial(link = "logit"), data = data.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5574  -0.5187  -0.1551   0.5534   3.0695  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               7.109817   0.211755  33.576  < 2e-16 ***
## grade11thGrade            0.943651   0.100455   9.394  < 2e-16 ***
## grade12thGrade            0.634239   0.108193   5.862 4.57e-09 ***
## grade9thGrade            -0.929754   0.102462  -9.074  < 2e-16 ***
## zip15206                 -0.147602   0.092604  -1.594 0.110957    
## zip15208                 -0.235732   0.123398  -1.910 0.056091 .  
## zip15224                  0.018543   0.115768   0.160 0.872745    
## zip15232                  0.143855   0.140297   1.025 0.305191    
## ethnicityAsian            0.351819   0.255456   1.377 0.168445    
## ethnicityHispanic         0.603531   0.246130   2.452 0.014203 *  
## ethnicityOther            0.537346   0.149916   3.584 0.000338 ***
## ethnicityWhite            0.954976   0.116815   8.175 2.96e-16 ***
## gpa                      -2.517968   0.059223 -42.516  < 2e-16 ***
## subsidizedLunchesNone    -0.184555   0.079577  -2.319 0.020384 *  
## subsidizedLunchesPartly   0.072634   0.106533   0.682 0.495370    
## hrsWifiPerWeek            0.067266   0.009395   7.160 8.09e-13 ***
## sanctionsnothing         -0.450525   0.079116  -5.694 1.24e-08 ***
## sanctionssuspended        0.103409   0.140836   0.734 0.462797    
## librarySwipesPerWeek     -0.332091   0.017130 -19.386  < 2e-16 ***
## apClasses                -0.365173   0.072632  -5.028 4.96e-07 ***
## athleticSeasons1         -0.748415   0.084718  -8.834  < 2e-16 ***
## athleticSeasons2         -0.365732   0.108113  -3.383 0.000717 ***
## athleticSeasons3          0.173296   0.209279   0.828 0.407636    
## athleticSeasons4          0.198680   0.601364   0.330 0.741111    
## athleticSeasons5         -8.503953 229.628532  -0.037 0.970458    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10246.3  on 7743  degrees of freedom
## Residual deviance:  5661.1  on 7719  degrees of freedom
## AIC: 5711.1
## 
## Number of Fisher Scoring iterations: 11
# LOGISTIC REGRESSION
# Evaluate the stepwise-selected model on the held-out 2017 cohort.
# Drop the identifier/partition columns so the test set matches training.
data_test <- data_test %>% 
  select(-year,-studentID)
logit.pred.prob <- predict(logit_mod, data_test, type = 'response')

# Using a decision boundary of 0.5 (i.e If P(y=1|X) > 0.5 then y="Yes" else y="No").
logit.pred <- ifelse(logit.pred.prob > 0.5, "1", "0")

test <- data_test$dropped
pred <- logit.pred
prob <- logit.pred.prob

# Build ROC inputs (TPR vs FPR) from the predicted probabilities.
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")


# Get performance metrics.
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(as.factor(pred), test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
# Alternative kappa via irr::kappa2, kept for reference:
#kappa <- kappa2(data.frame(test, pred))$value
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
# Start the model-comparison table with the logistic-regression row.
comparisons <- tibble(approach="Logistic Regression", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc)
# DECISION TREE (CART)
# Fit an rpart tree with caret's default bootstrap resampling over cp.
tree.mod <- train(dropped ~ ., data = data.train, method = "rpart")

# Inspect the resampling results and the chosen cp.
tree.mod
## CART 
## 
## 7744 samples
##   12 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 7744, 7744, 7744, 7744, 7744, 7744, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.03546832  0.8432527  0.6603159
##   0.04855372  0.8251994  0.6105566
##   0.52203857  0.7696179  0.4431317
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03546832.
# Predict classes on the 2017 test set with the bootstrap-tuned tree.
tree.pred <- predict(tree.mod, data_test)

# View the Confusion Matrix.
confusionMatrix(tree.pred, data_test$dropped, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2760   27
##          1  280   53
##                                           
##                Accuracy : 0.9016          
##                  95% CI : (0.8906, 0.9118)
##     No Information Rate : 0.9744          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2246          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.66250         
##             Specificity : 0.90789         
##          Pos Pred Value : 0.15916         
##          Neg Pred Value : 0.99031         
##              Prevalence : 0.02564         
##          Detection Rate : 0.01699         
##    Detection Prevalence : 0.10673         
##       Balanced Accuracy : 0.78520         
##                                           
##        'Positive' Class : 1               
## 
# Just like before, note that we can obtain predicted classes...
head(predict(tree.mod, data_test, type = "raw"))
## [1] 0 0 0 0 0 0
## Levels: 0 1
# ...as well as the predicted probabilities (with "raw" and "prob", respectively).
head(predict(tree.mod, data_test, type = "prob"))
##           0         1
## 1 0.9351379 0.0648621
## 2 0.9351379 0.0648621
## 3 0.9351379 0.0648621
## 4 0.6503690 0.3496310
## 5 0.9351379 0.0648621
## 6 0.9351379 0.0648621
# Tune the tree explicitly: 10-fold cross-validation, selecting the
# best-performing candidate.
ctrl <-
  trainControl(method = "cv",
               number = 10,
               selectionFunction = "best")

# Fine-grained complexity-parameter grid for rpart.
# NOTE: a leftover C5.0 tuning grid (.model/.trials/.winnow) that was
# immediately overwritten by this one has been removed.
grid <- 
  expand.grid(
    .cp = seq(from=0.0001, to=0.005, by=0.0001)
)

# Re-train the tree, optimizing Kappa (more informative than raw
# accuracy given the imbalanced test-set class distribution).
set.seed(1234)
tree.mod <-
  train(
    dropped ~ .,
    data = data.train,
    method = "rpart",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid
  )

# Visualize the final tuned tree.
library(rattle)
fancyRpartPlot(tree.mod$finalModel)

#----------------------------------------------------------------
#' #4. Random Forest
#----------------------------------------------------------------


# Candidate numbers of predictors sampled at each split.
grid <- expand.grid(.mtry = c(3, 6, 9,12))


# 3-fold CV; oneSE picks the simplest model within one standard error
# of the best.
ctrl <-
  trainControl(method = "cv",
               number = 3,
               selectionFunction = "oneSE")


# Train the random forest, optimizing Kappa.
set.seed(1234)
rf.mod <-
  train(
    dropped ~ .,
    data = data.train,
    method = "rf",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid
  )

# Inspect the resampling results and chosen mtry.
rf.mod
## Random Forest 
## 
## 7744 samples
##   12 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 5163, 5163, 5162 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    3    0.9189054  0.8250410
##    6    0.9342724  0.8596484
##    9    0.9359510  0.8633017
##   12    0.9373718  0.8663367
## 
## Kappa was used to select the optimal model using  the one SE rule.
## The final value used for the model was mtry = 6.
#----------------------------------------------------------------
#' #5. Extreme Gradient Boosting
#----------------------------------------------------------------

# 3-fold CV, selecting the best-performing hyperparameter combination.
ctrl <-
  trainControl(method = "cv",
               number = 3,
               selectionFunction = "best")


# Grid over tree depth, learning rate and row subsampling; the other
# xgboost hyperparameters are held constant.
grid <- expand.grid(
  nrounds = 20,
  max_depth = c(4, 6, 8),
  eta =  c(0.1, 0.3, 0.5),
  gamma = 0.01,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = c(0.5, 1)
)

# Train the boosted-tree model, optimizing Kappa.
set.seed(1234)
xgb.mod <-
  train(
    dropped ~ .,
    data = data.train,
    method = "xgbTree",
    metric = "Kappa",
    trControl = ctrl,
    tuneGrid = grid
  )

# Inspect the resampling results and chosen hyperparameters.
xgb.mod
## eXtreme Gradient Boosting 
## 
## 7744 samples
##   12 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 5163, 5163, 5162 
## Resampling results across tuning parameters:
## 
##   eta  max_depth  subsample  Accuracy   Kappa    
##   0.1  4          0.5        0.9106413  0.8091037
##   0.1  4          1.0        0.9084452  0.8044658
##   0.1  6          0.5        0.9221339  0.8340878
##   0.1  6          1.0        0.9252328  0.8404719
##   0.1  8          0.5        0.9253619  0.8410903
##   0.1  8          1.0        0.9265235  0.8433194
##   0.3  4          0.5        0.9275577  0.8452340
##   0.3  4          1.0        0.9300109  0.8505006
##   0.3  6          0.5        0.9341434  0.8591054
##   0.3  6          1.0        0.9402123  0.8722569
##   0.3  8          0.5        0.9342723  0.8594770
##   0.3  8          1.0        0.9409867  0.8738365
##   0.5  4          0.5        0.9331099  0.8569963
##   0.5  4          1.0        0.9377585  0.8669107
##   0.5  6          0.5        0.9334974  0.8580391
##   0.5  6          1.0        0.9420202  0.8760266
##   0.5  8          0.5        0.9338844  0.8584468
##   0.5  8          1.0        0.9426655  0.8773545
## 
## Tuning parameter 'nrounds' was held constant at a value of 20
## 
## Tuning parameter 'colsample_bytree' was held constant at a value of
##  1
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 20, max_depth = 8,
##  eta = 0.5, gamma = 0.01, colsample_bytree = 1, min_child_weight = 1
##  and subsample = 1.
#----------------------------------------------------------------
#' #6. Compare Model Performance
#----------------------------------------------------------------

#' ##Logistic Regression
# Train the model.
# NOTE(review): this refits the FULL-predictor model, discarding the
# stepwise selection performed earlier - confirm that is intentional.
# The "Logistic Regression" row created earlier is overwritten below.
logit.mod <-
  glm(dropped ~ ., family = binomial(link = 'logit'), data = data.train)

logit.pred.prob <- predict(logit.mod, data_test, type = 'response')

# Classify with a 0.5 decision boundary.
logit.pred <- as.factor(ifelse(logit.pred.prob > 0.5, "1", "0"))

test <- data_test$dropped
pred <- logit.pred
prob <- logit.pred.prob

# Plot ROC Curve.
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")


# Get performance metrics.
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
# (Re)start the comparison table with the logistic-regression row.
comparisons <- tibble(approach="Logistic Regression", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 



#' ##Classification Tree.
# Predicted classes and class probabilities on the 2017 test set.
tree.pred <- predict(tree.mod, data_test, type = "raw")
tree.pred.prob <- predict(tree.mod, data_test, type = "prob")

test <- data_test$dropped
pred <- tree.pred
# Probability of the positive ("dropped" = 1) class, for the ROC curve.
prob <- tree.pred.prob[,c("1")]

# Plot ROC Curve.
# dev.off()
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")



# Get performance metrics.
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
# Append the decision-tree row to the comparison table.
comparisons <- comparisons %>%
  add_row(approach="Decision Tree", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 



#' ##Random Forest.
# Predicted classes and class probabilities on the 2017 test set.
rf.pred <- predict(rf.mod, data_test, type = "raw")
rf.pred.prob <- predict(rf.mod, data_test, type = "prob")

test <- data_test$dropped
pred <- rf.pred
# Probability of the positive ("dropped" = 1) class, for the ROC curve.
prob <- rf.pred.prob[,c("1")]


# Plot ROC Curve.
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")


# Get performance metrics.
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
# Append the random-forest row to the comparison table.
comparisons <- comparisons %>%
  add_row(approach="Random Forest", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 

#' ##Extreme Gradient Boosting.
# Predicted classes and class probabilities on the 2017 test set.
xgb.pred <- predict(xgb.mod, data_test, type = "raw")
xgb.pred.prob <- predict(xgb.mod, data_test, type = "prob")

test <- data_test$dropped
pred <- xgb.pred
# Probability of the positive ("dropped" = 1) class, for the ROC curve.
prob <- xgb.pred.prob[,c("1")]


# Plot ROC Curve.
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")


# Get performance metrics.
accuracy <- mean(test == pred)
precision <- posPredValue(as.factor(pred), as.factor(test), positive = "1")
recall <- sensitivity(as.factor(pred), as.factor(test), positive = "1")
fmeasure <- (2 * precision * recall)/(precision + recall)
confmat <- confusionMatrix(pred, test, positive = "1")
kappa <- as.numeric(confmat$overall["Kappa"])
auc <- as.numeric(performance(roc.pred, measure = "auc")@y.values)
# Append the xgboost row to the comparison table.
comparisons <- comparisons %>%
  add_row(approach="Extreme Gradient Boosting", accuracy = accuracy, fmeasure = fmeasure, kappa = kappa, auc = auc) 


#' ##Output Comparison Table.
# Render the model-comparison tibble as a markdown table.
kable(comparisons)
## approach                   accuracy  fmeasure     kappa       auc
## Logistic Regression       0.9006410 0.2757009 0.2441860 0.9189638
## Decision Tree             0.9240385 0.3680000 0.3414321 0.9655016
## Random Forest             0.9339744 0.4046243 0.3801882 0.9729811
## Extreme Gradient Boosting 0.9378205 0.4085366 0.3846779 0.9752673
# Logistic-regression ROC curve (green, col = 3) plus the chance
# diagonal for reference.
test <- data_test$dropped
pred <- logit.pred
prob <- logit.pred.prob

# Plot ROC Curve.
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, main = "ROC Curve for Drop-out Rate Prediction Approaches",
     col = 3, lwd = 2)
# abline() is a separate base-graphics call; chaining it onto plot()
# with `+` only "worked" by accident (it printed the stray integer(0)).
abline(a = 0, b = 1, lwd = 3, lty = 2, col = 1)
# Overlay the decision-tree ROC curve (red, col = 2).
tree.pred <- predict(tree.mod, data_test, type = "raw")
tree.pred.prob <- predict(tree.mod, data_test, type = "prob")

test <- data_test$dropped
# NOTE(review): `pred` is assigned but not used in this plotting block.
pred <- tree.pred
prob <- tree.pred.prob[,c("1")]

# Plot ROC Curve.
# dev.off()
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=2, lwd = 2, add=TRUE)

#' ##Random Forest.
# Overlay the random-forest ROC curve (blue, col = 4).
rf.pred <- predict(rf.mod, data_test, type = "raw")
rf.pred.prob <- predict(rf.mod, data_test, type = "prob")

test <- data_test$dropped
# NOTE(review): `pred` is assigned but not used in this plotting block.
pred <- rf.pred
prob <- rf.pred.prob[,c("1")]

# Plot ROC Curve.
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=4, lwd = 2, add=TRUE)


#' ##XGBoost.
# Overlay the xgboost ROC curve (cyan, col = 5).
xgb.pred <- predict(xgb.mod, data_test, type = "raw")
xgb.pred.prob <- predict(xgb.mod, data_test, type = "prob")

test <- data_test$dropped
# NOTE(review): `pred` is assigned but not used in this plotting block.
pred <- xgb.pred
prob <- xgb.pred.prob[,c("1")]

# Plot ROC Curve.
roc.pred <- prediction(predictions = prob, labels = test)
roc.perf <- performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf, col=5, lwd = 2, add=TRUE)


# Draw the ROC legend. Colors match the plot() calls above:
# 2 = Decision Tree, 3 = Logistic Regression, 4 = Random Forest,
# 5 = Extreme Gradient Boosting. Previously `2:5` was passed
# positionally and fell through to legend()'s `fill` argument (colored
# boxes); use `col`/`lwd` so the legend shows line samples matching the
# plotted curves.
legend(0.6, 0.6,
       legend = c('Decision Tree', 'Logistic Regression', 'Random Forest',
                  'Extreme Gradient Boosting'),
       col = 2:5, lwd = 2)